## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
## ggtree v3.12.0 For help: https://yulab-smu.top/treedata-book/
##
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
##
## Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam.
## ggtree: an R package for visualization and annotation of phylogenetic
## trees with their covariates and other associated data. Methods in
## Ecology and Evolution. 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
##
## Guangchuang Yu. Data Integration, Manipulation and Visualization of
## Phylogenetic Trees (1st edition). Chapman and Hall/CRC. 2022,
## doi:10.1201/9781003279242
##
## G Yu. Data Integration, Manipulation and Visualization of Phylogenetic
## Trees (1st ed.). Chapman and Hall/CRC. 2022. ISBN: 9781032233574
##
## Attaching package: 'ggtree'
##
## The following object is masked from 'package:tidyr':
##
## expand
library(TDbook) #A Companion Package for the Book "Data Integration, Manipulation and Visualization of Phylogenetic Trees" by Guangchuang Yu (2022, ISBN:9781032233574).
library(ggimage)
library(rphylopic)## You are using rphylopic v.1.4.0. Please remember to credit PhyloPic contributors (hint: `get_attribution()`) and cite rphylopic in your work (hint: `citation("rphylopic")`).
##
## Attaching package: 'rphylopic'
##
## The following object is masked from 'package:ggimage':
##
## geom_phylopic
## treeio v1.28.0 For help: https://yulab-smu.top/treedata-book/
##
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
##
## LG Wang, TTY Lam, S Xu, Z Dai, L Zhou, T Feng, P Guo, CW Dunn, BR
## Jones, T Bradley, H Zhu, Y Guan, Y Jiang, G Yu. treeio: an R package
## for phylogenetic tree input and output with richly annotated and
## associated data. Molecular Biology and Evolution. 2020, 37(2):599-603.
## doi: 10.1093/molbev/msz240
##
## G Yu. Data Integration, Manipulation and Visualization of Phylogenetic
## Trees (1st ed.). Chapman and Hall/CRC. 2022. ISBN: 9781032233574
##
## Guangchuang Yu. Data Integration, Manipulation and Visualization of
## Phylogenetic Trees (1st edition). Chapman and Hall/CRC. 2022,
## doi:10.1201/9781003279242
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
##
## Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam.
## ggtree: an R package for visualization and annotation of phylogenetic
## trees with their covariates and other associated data. Methods in
## Ecology and Evolution. 2017, 8(1):28-36. doi:10.1111/2041-210X.12628
##
## Guangchuang Yu. Using ggtree to visualize data on tree-like structures.
## Current Protocols in Bioinformatics. 2020, 69:e96. doi:10.1002/cpbi.96
##
## Attaching package: 'tidytree'
##
## The following object is masked from 'package:treeio':
##
## getNodeNum
##
## The following object is masked from 'package:stats':
##
## filter
##
## Attaching package: 'ape'
##
## The following objects are masked from 'package:tidytree':
##
## drop.tip, keep.tip
##
## The following object is masked from 'package:treeio':
##
## drop.tip
##
## The following object is masked from 'package:ggtree':
##
## rotate
##
## The following object is masked from 'package:dplyr':
##
## where
##
## Attaching package: 'TreeTools'
##
## The following object is masked from 'package:tidytree':
##
## MRCA
##
## The following object is masked from 'package:treeio':
##
## MRCA
##
## The following object is masked from 'package:ggtree':
##
## MRCA
## Loading required package: maps
##
## Attaching package: 'maps'
##
## The following object is masked from 'package:purrr':
##
## map
##
##
## Attaching package: 'phytools'
##
## The following object is masked from 'package:TreeTools':
##
## as.multiPhylo
##
## The following object is masked from 'package:treeio':
##
## read.newick
## ggtreeExtra v1.14.0 For help: https://yulab-smu.top/treedata-book/
##
## If you use the ggtree package suite in published research, please cite
## the appropriate paper(s):
##
## S Xu, Z Dai, P Guo, X Fu, S Liu, L Zhou, W Tang, T Feng, M Chen, L
## Zhan, T Wu, E Hu, Y Jiang, X Bo, G Yu. ggtreeExtra: Compact
## visualization of richly annotated phylogenetic data. Molecular Biology
## and Evolution. 2021, 38(9):4039-4042. doi: 10.1093/molbev/msab166
NEON_MAGs <- read_csv("data/NEON/GOLD_Study_ID_Gs0161344_NEON_2024_4_21.csv") %>%
# remove columns that are not needed for data analysis
select(-c(`GOLD Study ID`, `Bin Methods`, `Created By`, `Date Added`, `Bin Lineage`)) %>%
# create a new column with the Assembly Type
mutate("Assembly Type" = case_when(`Genome Name` == "NEON combined assembly" ~ `Genome Name`,
TRUE ~ "Individual")) %>%
mutate_at("Assembly Type", str_replace, "NEON combined assembly", "Combined") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "d__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "p__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "c__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "o__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "f__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "g__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "s__", "") %>%
separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), ";", remove = FALSE) %>%
mutate_at("Domain", na_if,"") %>%
mutate_at("Phylum", na_if,"") %>%
mutate_at("Class", na_if,"") %>%
mutate_at("Order", na_if,"") %>%
mutate_at("Family", na_if,"") %>%
mutate_at("Genus", na_if,"") %>%
mutate_at("Species", na_if,"") %>%
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
# Get rid of the the common string "S-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
# separate the Sample Name into Site ID and plot info
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
# separate the plot info into 3 columns
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")## Rows: 1754 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Bin ID, Genome Name, Bin Quality, Bin Lineage, GTDB-Tk Taxonomy L...
## dbl (10): IMG Genome ID, Bin Completeness, Bin Contamination, Total Number ...
## date (1): Date Added
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 624 rows [1131, 1132,
## 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145,
## 1146, 1147, 1148, 1149, 1150, ...].
NEON_metagenomes <- read_tsv("data/NEON/exported_img_data_Gs0161344_NEON.tsv") %>%
select(-c(`Domain`, `Sequencing Status`, `Sequencing Center`)) %>%
rename(`Genome Name` = `Genome Name / Sample Name`) %>%
filter(str_detect(`Genome Name`, 're-annotation', negate = T)) %>%
filter(str_detect(`Genome Name`, 'WREF plot', negate = T))## Rows: 176 Columns: 46
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (18): Domain, Sequencing Status, Study Name, Genome Name / Sample Name, ...
## dbl (16): taxon_oid, IMG Genome ID, Depth In Meters, Elevation In Meters, Ge...
## lgl (12): Altitude In Meters, Chlorophyll Concentration, Longhurst Code, Lon...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
NEON_metagenomes <- NEON_metagenomes %>%
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
# Get rid of the the common string "-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
# separate the Sample Name into Site ID and plot info
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
# separate the plot info into 3 columns
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [53].
NEON_chemistry <- read_tsv("data/NEON/neon_plot_soilChem1_metadata.tsv") %>%
# remove -COMP from genomicsSampleID
mutate_at("genomicsSampleID", str_replace, "-COMP", "")## Rows: 87 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): genomicsSampleID, siteID, plotID, nlcdClass, horizon
## dbl (11): decimalLatitude, decimalLongitude, elevation, soilTemp, d15N, org...
## date (1): collectionDate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
NEON_MAGs_metagenomes_chemistry <- NEON_MAGs %>%
left_join(NEON_metagenomes, by = "Sample Name") %>%
left_join(NEON_chemistry, by = c("Sample Name" = "genomicsSampleID")) %>%
rename("label" = "Bin ID")NEON_MAGs_metagenomes_chemistry_noblank <- NEON_MAGs_metagenomes_chemistry %>%
rename("AssemblyType" = "Assembly Type") %>%
rename("BinCompleteness" = "Bin Completeness") %>%
rename("BinContamination" = "Bin Contamination") %>%
rename("TotalNumberofBases" = "Total Number of Bases") %>%
rename("EcosystemSubtype" = "Ecosystem Subtype")tree_arc <- read.tree("data/NEON/gtdbtk.ar53.decorated.tree")
tree_bac <- read.tree("data/NEON/gtdbtk.bac120.decorated.tree")node_vector_bac = c(tree_bac$tip.label,tree_bac$node.label)
grep("Bacteroidota", node_vector_bac, value = TRUE)## [1] "'1.0:p__Bacteroidota'"
## [1] 2507
##Konza
NEON_MAGs_bact_ind %>%
filter(is.na(Class) | is.na(Order) | is.na(Domain) | is.na(Phylum) | is.na(Family) | is.na(Genus)) %>%
ggplot(aes(x = fct_infreq(`Site`))) +
geom_bar() +
coord_flip() +
labs(title = 'Novel Bacteria by Site')NEON_MAGs_bact_co %>%
filter(is.na(Class) | is.na(Order) | is.na(Domain) | is.na(Phylum) | is.na(Family) | is.na(Genus)) %>%
ggplot(aes(x = fct_infreq(`Site`))) +
geom_bar() +
coord_flip() +
labs(title = 'Novel Bacteria by Site')NEON_MAGs_bact_ind %>%
ggplot(aes(x = fct_rev(fct_infreq(Site)), fill = Phylum)) +
geom_bar() +
coord_flip() +
labs(title = 'Total MAGs at Each Site')NEON_MAGs_bact_ind2 %>%
ggplot(aes(x = Phylum, fill = Subplot)) +
geom_bar(position = position_dodge2(width = 0.9, preserve = "single")) +
labs(title = 'MAG Count for each Phylum at the KONZA Site')+
coord_flip()Chemical distribution of the site. Konza
ggplotly(
ggplot(data= NEON_Bact ,aes(x = fct_infreq(`Site ID.x`), y = soilTemp)) +
geom_point(aes(color= Family))+
geom_boxplot() +
theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1))
)## Warning: Removed 11 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
ggplotly(
ggplot(data= NEON_full ,aes(x = fct_infreq(`Site ID.x`), y = soilTemp)) +
geom_point(aes(color= Phylum))+
geom_boxplot() +
theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1))
)## Warning: Removed 625 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
NEON_full %>%
ggplot(aes(x = CNratio, y = Family)) +
labs(title = "New Graph - Josh") +
geom_point(aes(color='Family')) +
coord_flip()## Warning: Removed 1678 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggtree(tree_Bacteroidota, layout="circular", branch.length="none") %<+%
NEON_MAGs_metagenomes_chemistry +
geom_point2(mapping=aes(color=`Ecosystem Subtype`, size=`Total Number of Bases`)) +
new_scale_fill() +
geom_fruit(
data=NEON_MAGs_metagenomes_chemistry_noblank,
geom=geom_tile,
mapping=aes(y=label, x=1, fill= AssemblyType),
offset=0.08, # The distance between external layers, default is 0.03 times of x range of tree.
pwidth=0.25 # width of the external layer, default is 0.2 times of x range of tree.
)## ! The following column names/name: Site.x, Sample Name, Site ID.x, Subplot.x, Layer.x, Date.x, IMG Genome ID.x, Bin Quality, GTDB-Tk Taxonomy Lineage, Domain, Phylum, Class, Order, Family, Genus, Species, 5s rRNA, 16s rRNA, 23s rRNA, tRNA Genes, Gene Count, Scaffold Count, taxon_oid, Study Name, Site.y, Site ID.y, Subplot.y, Layer.y, Date.y, IMG Genome ID.y, GOLD Study ID, Ecosystem, Ecosystem Category, Ecosystem Type, Specific Ecosystem, Altitude In Meters, Chlorophyll Concentration, Depth In Meters, Elevation In Meters, Geographic Location, Habitat, Isolation, Isolation Country, Latitude, Longhurst Code, Longhurst Description, Longitude, Nitrate Concentration, Oxygen Concentration, pH, Pressure, Salinity, Salinity Concentration, Sample Collection Date, Sample Collection Temperature, Subsurface In Meters, Genome Size * assembled, Gene Count * assembled, Scaffold Count * assembled, Genome MetaBAT Bin Count * assembled, Genome EukCC Bin Count * assembled, CRISPR Count * assembled, GC Count * assembled, GC * assembled, Coding Base Count * assembled, Coding Base Count % * assembled, CDS Count * assembled, CDS % * assembled, siteID, plotID, nlcdClass, decimalLatitude, decimalLongitude, elevation, collectionDate, horizon, soilTemp, d15N, organicd13C, nitrogenPercent, organicCPercent, CNratio, soilInWaterpH, soilInCaClpH are/is the same to tree data, the tree data column names are : label, y, angle, Site.x, Sample Name, Site ID.x, Subplot.x, Layer.x, Date.x, IMG Genome ID.x, Bin Quality, GTDB-Tk Taxonomy Lineage, Domain, Phylum, Class, Order, Family, Genus, Species, Bin Completeness, Bin Contamination, Total Number of Bases, 5s rRNA, 16s rRNA, 23s rRNA, tRNA Genes, Gene Count, Scaffold Count, Assembly Type, taxon_oid, Study Name, Site.y, Site ID.y, Subplot.y, Layer.y, Date.y, IMG Genome ID.y, GOLD Study ID, Ecosystem, Ecosystem Category, Ecosystem Subtype, Ecosystem Type, Specific Ecosystem, Altitude In Meters, Chlorophyll Concentration, Depth In Meters, Elevation In Meters, Geographic Location, Habitat, Isolation, Isolation Country, Latitude, Longhurst Code, Longhurst Description, Longitude, Nitrate Concentration, Oxygen Concentration, pH, Pressure, Salinity, Salinity Concentration, Sample Collection Date, Sample Collection Temperature, Subsurface In Meters, Genome Size * assembled, Gene Count * assembled, Scaffold Count * assembled, Genome MetaBAT Bin Count * assembled, Genome EukCC Bin Count * assembled, CRISPR Count * assembled, GC Count * assembled, GC * assembled, Coding Base Count * assembled, Coding Base Count % * assembled, CDS Count * assembled, CDS % * assembled, siteID, plotID, nlcdClass, decimalLatitude, decimalLongitude, elevation, collectionDate, horizon, soilTemp, d15N, organicd13C, nitrogenPercent, organicCPercent, CNratio, soilInWaterpH, soilInCaClpH.
## Warning: Removed 32 rows containing missing values or values outside the scale range
## (`geom_point_g_gtree()`).